# -*- coding: utf-8 -*-
import random

import requests
import time
from bs4 import BeautifulSoup
from tomorrow import threads

from common.request import requestutil
from config import config

"""
1、抓取西刺代理网站的代理ip
2、并根据指定的目标url,对抓取到ip的有效性进行验证
3、最后存到指定的path
"""


# Harvest proxies from the free-proxy sites; runs on a background
# worker thread via the @threads decorator from `tomorrow`.
@threads(1)
def gather_proxys():
    global cache_proxys
    print("开始采集代理,当前数量:" + str(len(cache_proxys)))
    for page in ("http://www.xicidaili.com/nt/1",
                 "http://www.xicidaili.com/nt/2"):
        xici_proxys(page)
    for page in ("https://www.kuaidaili.com/free/inha/1/",
                 "https://www.kuaidaili.com/free/inha/2/"):
        kuai_proxys(page)
    print("采集代理完成,当前数量:" + str(len(cache_proxys)))


# Collect proxies from kuaidaili.com's free listing page.
def kuai_proxys(url):
    """Scrape one kuaidaili listing page and append working proxies.

    Each table row is turned into a {header: cell-text} dict; rows whose
    proxy passes validateIp() are appended to the global cache_proxys.
    Malformed rows (fewer cells than headers, missing IP/PORT columns)
    are skipped instead of raising IndexError/KeyError.
    """
    html = requests.get(url=url, headers=requestutil.randomheader(), timeout=4).text
    soup = BeautifulSoup(html, 'lxml')
    trs = soup.find_all('tr')
    if not trs:
        return

    # Header row: column names such as "IP" and "PORT".
    ths = [th.text.strip() for th in trs[0].find_all('th')]

    global cache_proxys
    for tr in trs[1:]:
        tds = tr.find_all('td')
        # Skip rows (ads, separators) that don't line up with the header.
        if len(tds) < len(ths):
            continue
        # Fresh dict per row so a short row can never see stale values.
        cells = {name: td.text.strip() for name, td in zip(ths, tds)}
        ip = cells.get("IP")
        port = cells.get("PORT")
        if not ip or not port:
            continue
        proxy_url = 'http://' + ip + ':' + port
        # The site lists plain HTTP proxies, so both schemes use http://.
        proxy_dict = {
            'http': proxy_url,
            'https': proxy_url
        }
        if validateIp(proxy_dict):
            cache_proxys.append(proxy_dict)
            print("快代理有效,当前数量:" + str(len(cache_proxys)))


# Collect proxies from xicidaili.com's free listing page.
def xici_proxys(url):
    """Scrape one xicidaili listing page and append working proxies.

    Each table row is turned into a {header: cell-text} dict; rows whose
    proxy passes validateIp() are appended to the global cache_proxys.
    Malformed rows (fewer cells than headers, missing IP/port columns)
    are skipped instead of raising IndexError/KeyError.
    """
    html = requests.get(url=url, headers=requestutil.randomheader(), timeout=4).text
    soup = BeautifulSoup(html, 'lxml')
    trs = soup.find_all('tr')
    if not trs:
        return

    # Header row: column names such as "IP地址" (IP address) and "端口" (port).
    ths = [th.text.strip() for th in trs[0].find_all('th')]

    global cache_proxys
    for tr in trs[1:]:
        tds = tr.find_all('td')
        # Skip rows (ads, separators) that don't line up with the header.
        if len(tds) < len(ths):
            continue
        # Fresh dict per row so a short row can never see stale values.
        cells = {name: td.text.strip() for name, td in zip(ths, tds)}
        ip = cells.get("IP地址")
        port = cells.get("端口")
        if not ip or not port:
            continue
        proxy_url = 'http://' + ip + ':' + port
        # The site lists plain HTTP proxies, so both schemes use http://.
        proxy_dict = {
            'http': proxy_url,
            'https': proxy_url
        }
        if validateIp(proxy_dict):
            cache_proxys.append(proxy_dict)
            print("西刺代理有效,当前数量:" + str(len(cache_proxys)))


def validateIp(proxy_dict):
    """Return True if the proxy fetches a test page within 3 seconds.

    proxy_dict is a requests-style proxies mapping, e.g.
    {'http': 'http://1.2.3.4:80', 'https': 'http://1.2.3.4:80'}.
    """
    url = 'http://www.baidu.com/s?wd=ip'
    try:
        response = requests.get(url, proxies=proxy_dict, timeout=3)
        # Only a clean 200 counts as a working proxy.
        return response.status_code == 200
    except requests.RequestException:
        # Narrowed from a bare `except:` which also swallowed
        # KeyboardInterrupt/SystemExit; any transport failure just
        # means the proxy is unusable.
        return False


# Remove an unusable proxy from the cache (first matching entry only).
def remove_bad_proxy(proxy):
    global cache_proxys
    try:
        # list.remove drops the first equal element — same semantics as
        # the old scan-and-break loop, without mutating the list while
        # iterating over it.
        cache_proxys.remove(proxy)
    except ValueError:
        # Proxy was not in the cache (already removed) — nothing to do.
        pass


# Shared pool of validated proxy dicts, appended to by the scraper functions.
cache_proxys = []
# Minimum pool size that init_proxy()/wait_enguth_proxy() block for.
cache_proxys_min_cnt = 0


# Pick a random proxy from the cache.
def get_proxy():
    """Return a random proxy dict from the pool, or None if it is empty.

    Previously an empty pool made random.randint(0, -1) raise ValueError;
    returning None lets callers detect the empty pool gracefully.
    """
    global cache_proxys
    if not cache_proxys:
        return None
    return random.choice(cache_proxys)


def init_proxy(cnt=10):
    """Start gathering proxies and block until at least *cnt* are cached."""
    global cache_proxys_min_cnt
    # Record the target pool size, kick off gathering on the background
    # thread, then wait for the pool to fill up.
    cache_proxys_min_cnt = cnt
    gather_proxys()
    wait_enguth_proxy()


def wait_enguth_proxy():
    """Poll every 5 seconds until the proxy pool reaches the configured minimum."""
    global cache_proxys_min_cnt
    while len(cache_proxys) < cache_proxys_min_cnt:
        print("未得到足够的代理,当前数量:" + str(len(cache_proxys)))
        time.sleep(5)
    print("=========================================")
    print("已得到足够的代理,当前数量:" + str(len(cache_proxys)))


if __name__ == '__main__':
    # Manual smoke test: start gathering proxies (runs on the @threads pool).
    gather_proxys()
